#This script downloads the raw data from the Dropbox link and extracts it to the data/raw directory
#It is not to be included in the final version
#This script is only used to download the data for the first time
#Downloading the data takes a considerable amount of time
#As such the processed parquet files (generated in 01_write_and_sample_parquets.py) are already included in the data/output/combined directory
import io
import os
import tempfile
import zipfile

import requests
from tqdm import tqdm

def download_and_extract_zip(url, extract_to, *, chunk_size=1024 * 1024, timeout=60):
    """Download a ZIP archive from *url* and extract it into *extract_to*.

    The archive is streamed to a temporary file (created inside *extract_to*)
    rather than held in memory, so large downloads do not exhaust RAM. The
    temporary file is removed automatically once extraction finishes or fails.

    Args:
        url: Direct-download URL of the ZIP archive.
        extract_to: Directory the archive is extracted into; created if it
            does not exist.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            connection fails instead of hanging indefinitely.

    Returns:
        None. If the server answers with a status other than 200, a message
        is printed and nothing is extracted.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or an invalid URL.
        zipfile.BadZipFile: If the downloaded payload is not a ZIP archive.
    """
    # Ensure the destination directory exists
    os.makedirs(extract_to, exist_ok=True)

    print("Downloading ZIP file...")
    # Spool the download to disk next to the destination instead of RAM.
    with tempfile.TemporaryFile(dir=extract_to) as archive:
        # The context manager releases the HTTP connection even when the
        # download is interrupted or the status check fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download file. Status code: {response.status_code}")
                return

            # A missing (or zero) Content-Length means the size is unknown;
            # passing None makes tqdm show a running byte count instead of a
            # bar measured against a total of 0.
            total_size = int(response.headers.get('content-length', 0)) or None

            # Download with a progress bar
            with tqdm(total=total_size, unit='B', unit_scale=True, desc='Downloading') as pbar:
                for data in response.iter_content(chunk_size):
                    archive.write(data)
                    pbar.update(len(data))

        # Rewind so ZipFile reads the archive from its first byte
        archive.seek(0)
        print("Download successful. Extracting files...")

        # Extract the ZIP file
        with zipfile.ZipFile(archive) as zip_ref:
            zip_ref.extractall(extract_to)
    print("Extraction complete!")

if __name__ == "__main__":
    # Dropbox URL with dl=1 to force direct download of a ZIP file
    dropbox_url = "" #Provided upon request for purposes of replication only

    # Destination path where you want the directory to be extracted
    destination_path = "data/raw"

    # The URL is left blank in the published script. Stop with a clear message
    # instead of letting the HTTP request fail on an empty URL.
    if not dropbox_url:
        raise SystemExit(
            "No download URL configured: set 'dropbox_url' in this script before running it."
        )

    download_and_extract_zip(dropbox_url, destination_path)
